import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.io as pio
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import zscore, skew, kurtosis
import os
import warnings
warnings.filterwarnings("ignore")
os.chdir("C:/Users/cwhwe/Desktop/May_2021/Data_Analytics/Assignment")
df = pd.read_csv("ai4i2020.csv", index_col = 0)
df.head()
| Product ID | Type | Air temperature [K] | Process temperature [K] | Rotational speed [rpm] | Torque [Nm] | Tool wear [min] | Machine failure | TWF | HDF | PWF | OSF | RNF | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| UDI | |||||||||||||
| 1 | M14860 | M | 298.1 | 308.6 | 1551 | 42.8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | L47181 | L | 298.2 | 308.7 | 1408 | 46.3 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | L47182 | L | 298.1 | 308.5 | 1498 | 49.4 | 5 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | L47183 | L | 298.2 | 308.6 | 1433 | 39.5 | 7 | 0 | 0 | 0 | 0 | 0 | 0 |
| 5 | L47184 | L | 298.2 | 308.7 | 1408 | 40.0 | 9 | 0 | 0 | 0 | 0 | 0 | 0 |
df.tail()
| Product ID | Type | Air temperature [K] | Process temperature [K] | Rotational speed [rpm] | Torque [Nm] | Tool wear [min] | Machine failure | TWF | HDF | PWF | OSF | RNF | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| UDI | |||||||||||||
| 9996 | M24855 | M | 298.8 | 308.4 | 1604 | 29.5 | 14 | 0 | 0 | 0 | 0 | 0 | 0 |
| 9997 | H39410 | H | 298.9 | 308.4 | 1632 | 31.8 | 17 | 0 | 0 | 0 | 0 | 0 | 0 |
| 9998 | M24857 | M | 299.0 | 308.6 | 1645 | 33.4 | 22 | 0 | 0 | 0 | 0 | 0 | 0 |
| 9999 | H39412 | H | 299.0 | 308.7 | 1408 | 48.5 | 25 | 0 | 0 | 0 | 0 | 0 | 0 |
| 10000 | M24859 | M | 299.0 | 308.7 | 1500 | 40.2 | 30 | 0 | 0 | 0 | 0 | 0 | 0 |
col_bin = ["Machine failure", "TWF", "HDF", "PWF", "OSF", "RNF"]
col_cat = ["Type"]
col_flt = ["Air temperature [K]", "Process temperature [K]", "Rotational speed [rpm]", "Torque [Nm]", "Tool wear [min]"]
col_obj = ["Product ID"]
col_ = col_bin, col_cat, col_flt, col_obj
binary_ = ["Feature: {} has values {}".format(i, df[i].unique()) for i in col_bin]
category_ = "Feature: {} has values {}".format(col_cat[0], df[col_cat[0]].unique())
binary_, category_
(['Feature: Machine failure has values [0 1]', 'Feature: TWF has values [0 1]', 'Feature: HDF has values [0 1]', 'Feature: PWF has values [0 1]', 'Feature: OSF has values [0 1]', 'Feature: RNF has values [0 1]'], "Feature: Type has values ['M' 'L' 'H']")
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 10000 entries, 1 to 10000 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Product ID 10000 non-null object 1 Type 10000 non-null object 2 Air temperature [K] 10000 non-null float64 3 Process temperature [K] 10000 non-null float64 4 Rotational speed [rpm] 10000 non-null int64 5 Torque [Nm] 10000 non-null float64 6 Tool wear [min] 10000 non-null int64 7 Machine failure 10000 non-null int64 8 TWF 10000 non-null int64 9 HDF 10000 non-null int64 10 PWF 10000 non-null int64 11 OSF 10000 non-null int64 12 RNF 10000 non-null int64 dtypes: float64(3), int64(8), object(2) memory usage: 1.1+ MB
df["Type"] = df["Type"].astype("category")
for i in col_bin:
df[i] = df[i].astype("int8")
df.info()
print("memory usage is reduced by 0.4MB")
<class 'pandas.core.frame.DataFrame'> Int64Index: 10000 entries, 1 to 10000 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Product ID 10000 non-null object 1 Type 10000 non-null category 2 Air temperature [K] 10000 non-null float64 3 Process temperature [K] 10000 non-null float64 4 Rotational speed [rpm] 10000 non-null int64 5 Torque [Nm] 10000 non-null float64 6 Tool wear [min] 10000 non-null int64 7 Machine failure 10000 non-null int8 8 TWF 10000 non-null int8 9 HDF 10000 non-null int8 10 PWF 10000 non-null int8 11 OSF 10000 non-null int8 12 RNF 10000 non-null int8 dtypes: category(1), float64(3), int64(2), int8(6), object(1) memory usage: 615.4+ KB memory usage is reduced by 0.4MB
print(df.isnull().sum())
Product ID 0 Type 0 Air temperature [K] 0 Process temperature [K] 0 Rotational speed [rpm] 0 Torque [Nm] 0 Tool wear [min] 0 Machine failure 0 TWF 0 HDF 0 PWF 0 OSF 0 RNF 0 dtype: int64
round(df[col_flt].describe(),4)
| Air temperature [K] | Process temperature [K] | Rotational speed [rpm] | Torque [Nm] | Tool wear [min] | |
|---|---|---|---|---|---|
| count | 10000.0000 | 10000.0000 | 10000.0000 | 10000.0000 | 10000.0000 |
| mean | 300.0049 | 310.0056 | 1538.7761 | 39.9869 | 107.9510 |
| std | 2.0003 | 1.4837 | 179.2841 | 9.9689 | 63.6541 |
| min | 295.3000 | 305.7000 | 1168.0000 | 3.8000 | 0.0000 |
| 25% | 298.3000 | 308.8000 | 1423.0000 | 33.2000 | 53.0000 |
| 50% | 300.1000 | 310.1000 | 1503.0000 | 40.1000 | 108.0000 |
| 75% | 301.5000 | 311.1000 | 1612.0000 | 46.8000 | 162.0000 |
| max | 304.5000 | 313.8000 | 2886.0000 | 76.6000 | 253.0000 |
def create_df(x,y):
return pd.DataFrame({
"x":x,
"y":y
})
temp_ = create_df(df.Type.unique(), df.Type.value_counts())
bar_plot_1 = px.bar(temp_, x="x", y="y", width=600, height=350)
bar_plot_1.update_layout(xaxis_title="Type", yaxis_title="Frequency", \
title="AI4I 2020 Predictive Maintenance Dataset Data Set", \
margin=dict(l=30, r=30, t=50, b=30),
paper_bgcolor="LightSteelBlue")
bar_plot_1.show()
hist_plot = make_subplots(rows=len(col_flt), cols=1, subplot_titles = ["Frequency"]*5)
for i, j in enumerate(col_flt):
hist_plot.add_trace(
go.Histogram(x=df[j], name = j),
row=i+1, col=1
)
hist_plot.update_layout(xaxis_title=j, yaxis_title="Frequency", \
title="AI4I 2020 Predictive Maintenance Dataset Data Set", \
margin=dict(l=30, r=30, t=100, b=10),
paper_bgcolor="LightSteelBlue",
height=1200, width=950)
hist_plot.show()
df[col_flt].corr()
| Air temperature [K] | Process temperature [K] | Rotational speed [rpm] | Torque [Nm] | Tool wear [min] | |
|---|---|---|---|---|---|
| Air temperature [K] | 1.000000 | 0.876107 | 0.022670 | -0.013778 | 0.013853 |
| Process temperature [K] | 0.876107 | 1.000000 | 0.019277 | -0.014061 | 0.013488 |
| Rotational speed [rpm] | 0.022670 | 0.019277 | 1.000000 | -0.875027 | 0.000223 |
| Torque [Nm] | -0.013778 | -0.014061 | -0.875027 | 1.000000 | -0.003093 |
| Tool wear [min] | 0.013853 | 0.013488 | 0.000223 | -0.003093 | 1.000000 |
sns.heatmap(df[col_flt].corr())
<AxesSubplot:>
sns.pairplot(df[col_flt].sample(750))
<seaborn.axisgrid.PairGrid at 0x162ac1f2070>
skewness = [round(skew(df[i]),3) for i in col_flt]
kurtosis = [round(kurtosis(df[i]),3) for i in col_flt]
skew_kurt = list(zip(col_flt, skewness, kurtosis))
skew_kurt
[('Air temperature [K]', 0.114, -0.836),
('Process temperature [K]', 0.015, -0.5),
('Rotational speed [rpm]', 1.993, 7.389),
('Torque [Nm]', -0.01, -0.014),
('Tool wear [min]', 0.027, -1.167)]
print("Normal Distribution Rules: Skewness should be in between -2 and 2 while Kurtosis should be in between -7 and 7. \n")
for number ,(i, j, k) in enumerate(skew_kurt):
print("{}) {} has skewness of {} and kurtosis of {}".format(number+1, i,j,k))
Normal Distribution Rules: Skewness should be in between -2 and 2 while Kurtosis should be in between -7 and 7. 1) Air temperature [K] has skewness of 0.114 and kurtosis of -0.836 2) Process temperature [K] has skewness of 0.015 and kurtosis of -0.5 3) Rotational speed [rpm] has skewness of 1.993 and kurtosis of 7.389 4) Torque [Nm] has skewness of -0.01 and kurtosis of -0.014 5) Tool wear [min] has skewness of 0.027 and kurtosis of -1.167
def machine_failure_by_type(df, type_, mf, *func):
"""
Parameters: dataframe, column, column, list of functions
"""
df_ = round(df.groupby(type_)[mf].agg(*func)*100,4) #percentage of machine failure
df_.columns = ["Machine Failure (%)"]
return df_
def print_machile_failure_by_type(df):
"""
Notice: this function can only be called after machine_failure_by_type
Parameters: dataframe
"""
dict_ = {
"H": "High Product Quality",
"L": "Low Product Quality",
"M": "Medium Product Quality"
}
for i, j in list(zip(list(df.index), list(df.values))):
print("The Machine Failure for {} is {}%".format(dict_[i], j))
df_machine_failure_by_type = machine_failure_by_type(df, "Type", "Machine failure", "mean")
print_machile_failure_by_type(df_machine_failure_by_type)
The Machine Failure for High Product Quality is 2.0937% The Machine Failure for Low Product Quality is 3.9167% The Machine Failure for Medium Product Quality is 2.7694%
Pivot_table_bin = []
for i in col_bin:
Pivot_table_bin.append(round(df.groupby(i)[col_flt[0], col_flt[1], col_flt[2], col_flt[3], col_flt[4]].\
agg(["mean", "std"]),4))
Pivot_table_bin[0]
| Air temperature [K] | Process temperature [K] | Rotational speed [rpm] | Torque [Nm] | Tool wear [min] | ||||||
|---|---|---|---|---|---|---|---|---|---|---|
| mean | std | mean | std | mean | std | mean | std | mean | std | |
| Machine failure | ||||||||||
| 0 | 299.9740 | 1.9907 | 309.9956 | 1.4868 | 1540.2600 | 167.3947 | 39.6297 | 9.4721 | 106.6937 | 62.9458 |
| 1 | 300.8864 | 2.0715 | 310.2903 | 1.3637 | 1496.4867 | 384.9435 | 50.1681 | 16.3745 | 143.7817 | 72.7599 |
Pivot_table_bin[1]
| Air temperature [K] | Process temperature [K] | Rotational speed [rpm] | Torque [Nm] | Tool wear [min] | ||||||
|---|---|---|---|---|---|---|---|---|---|---|
| mean | std | mean | std | mean | std | mean | std | mean | std | |
| TWF | ||||||||||
| 0 | 300.0036 | 2.0005 | 310.0048 | 1.4838 | 1538.6495 | 179.1547 | 39.9968 | 9.9669 | 107.4500 | 63.3666 |
| 1 | 300.2978 | 1.9460 | 310.1652 | 1.4840 | 1566.1739 | 205.7685 | 37.8370 | 10.2772 | 216.3696 | 12.2572 |
Pivot_table_bin[2]
| Air temperature [K] | Process temperature [K] | Rotational speed [rpm] | Torque [Nm] | Tool wear [min] | ||||||
|---|---|---|---|---|---|---|---|---|---|---|
| mean | std | mean | std | mean | std | mean | std | mean | std | |
| HDF | ||||||||||
| 0 | 299.9752 | 1.9916 | 309.9964 | 1.4883 | 1541.1205 | 178.9549 | 39.8336 | 9.9017 | 107.9598 | 63.6576 |
| 1 | 302.5609 | 0.6019 | 310.7887 | 0.6445 | 1337.2609 | 34.7460 | 53.1670 | 6.2235 | 107.1913 | 63.6293 |
Pivot_table_bin[3]
| Air temperature [K] | Process temperature [K] | Rotational speed [rpm] | Torque [Nm] | Tool wear [min] | ||||||
|---|---|---|---|---|---|---|---|---|---|---|
| mean | std | mean | std | mean | std | mean | std | mean | std | |
| PWF | ||||||||||
| 0 | 300.0043 | 1.9989 | 310.0060 | 1.4826 | 1536.6163 | 168.2316 | 39.9051 | 9.6342 | 108.0092 | 63.6479 |
| 1 | 300.0758 | 2.1471 | 309.9547 | 1.6004 | 1763.9684 | 620.8291 | 48.5147 | 26.7887 | 101.8842 | 64.3557 |
Pivot_table_bin[4]
| Air temperature [K] | Process temperature [K] | Rotational speed [rpm] | Torque [Nm] | Tool wear [min] | ||||||
|---|---|---|---|---|---|---|---|---|---|---|
| mean | std | mean | std | mean | std | mean | std | mean | std | |
| OSF | ||||||||||
| 0 | 300.0045 | 2.0001 | 310.0049 | 1.4835 | 1540.6412 | 179.0787 | 39.8050 | 9.8305 | 106.9638 | 63.1669 |
| 1 | 300.0449 | 2.0286 | 310.0735 | 1.5110 | 1350.3265 | 61.2508 | 58.3704 | 5.9436 | 207.6939 | 15.8110 |
Pivot_table_bin[5]
| Air temperature [K] | Process temperature [K] | Rotational speed [rpm] | Torque [Nm] | Tool wear [min] | ||||||
|---|---|---|---|---|---|---|---|---|---|---|
| mean | std | mean | std | mean | std | mean | std | mean | std | |
| RNF | ||||||||||
| 0 | 300.0034 | 2.0005 | 310.0041 | 1.4835 | 1538.8785 | 179.3791 | 39.9799 | 9.9670 | 107.9195 | 63.6421 |
| 1 | 300.8158 | 1.7076 | 310.7632 | 1.4595 | 1485.0000 | 109.4390 | 43.6737 | 10.5759 | 124.4737 | 69.5504 |
round(df.groupby("Type")[col_flt[0], col_flt[1], col_flt[2], col_flt[3], col_flt[4]].agg(["mean", "std"]), 4)
| Air temperature [K] | Process temperature [K] | Rotational speed [rpm] | Torque [Nm] | Tool wear [min] | ||||||
|---|---|---|---|---|---|---|---|---|---|---|
| mean | std | mean | std | mean | std | mean | std | mean | std | |
| Type | ||||||||||
| H | 299.8670 | 2.0218 | 309.9257 | 1.4894 | 1538.1476 | 173.1334 | 39.8383 | 9.6423 | 107.4197 | 63.0801 |
| L | 300.0158 | 1.9875 | 310.0123 | 1.4752 | 1539.4692 | 180.4285 | 39.9966 | 10.0123 | 108.3788 | 64.0582 |
| M | 300.0293 | 2.0174 | 310.0188 | 1.4984 | 1537.5989 | 179.0598 | 40.0173 | 9.9922 | 107.2723 | 63.0446 |
round(df.groupby("Type")[col_bin[0], col_bin[1], col_bin[2], col_bin[3], col_bin[4], col_bin[5]]\
.agg([np.mean, np.std]), 4)
| Machine failure | TWF | HDF | PWF | OSF | RNF | |||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| mean | std | mean | std | mean | std | mean | std | mean | std | mean | std | |
| Type | ||||||||||||
| H | 0.0209 | 0.1432 | 0.0070 | 0.0833 | 0.0080 | 0.0890 | 0.0050 | 0.0705 | 0.0020 | 0.0446 | 0.0040 | 0.0631 |
| L | 0.0392 | 0.1940 | 0.0042 | 0.0644 | 0.0127 | 0.1118 | 0.0098 | 0.0987 | 0.0145 | 0.1195 | 0.0022 | 0.0465 |
| M | 0.0277 | 0.1641 | 0.0047 | 0.0682 | 0.0103 | 0.1012 | 0.0103 | 0.1012 | 0.0030 | 0.0547 | 0.0007 | 0.0258 |
df_pie = []
for i in col_bin:
df_temp = df.groupby(i)["Product ID"].agg(["count"])
df_temp["percentage"] = (df_temp["count"] / df_temp["count"].sum())*100
df_pie.append(df_temp)
df_pie[0].index
Int64Index([0, 1], dtype='int64', name='Machine failure')
multi_index = []
binary_ = []
for i in col_bin:
multi_index.extend([i,i])
binary_.extend([0,1])
multi_index_ = list(zip(multi_index, binary_))
multi_index_values = []
for i in df_pie:
item = list(zip(i["count"].tolist(), i["percentage"].tolist()))
multi_index_values.extend(item)
multi_index__ = pd.MultiIndex.from_tuples(multi_index_, name=["Machinery", "isFailure"])
df_pie_ = pd.DataFrame(multi_index_values, index=multi_index__)
df_pie_.rename(columns={0:"count", 1:"percentage(%)"}, inplace=True)
df_pie_.T
| Machinery | Machine failure | TWF | HDF | PWF | OSF | RNF | ||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| isFailure | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 |
| count | 9661.00 | 339.00 | 9954.00 | 46.00 | 9885.00 | 115.00 | 9905.00 | 95.00 | 9902.00 | 98.00 | 9981.00 | 19.00 |
| percentage(%) | 96.61 | 3.39 | 99.54 | 0.46 | 98.85 | 1.15 | 99.05 | 0.95 | 99.02 | 0.98 | 99.81 | 0.19 |